These exercises are about manipulate single-cell data with Seurat. Please download the counting matrix from BOX and loading them into a Seurat object. Or you may also used the rds file data/scSeq_CKO_sceSub.rds.
Exercise 2 - Data manipulation with Bioconductor packages
bcrank <- barcodeRanks(counts(sce))
uniq <- !duplicated(bcrank$rank)
#
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
abline(h=metadata(bcrank)$inflection, col="darkgreen", lty=2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Inflection", "Knee"),
col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2)## DataFrame with 400000 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## CCACGGAAGCTCTCGG-1 0 NA NA NA NA
## TGTGGTAAGAGTCGGT-1 2 -11.1237 0.7561244 FALSE 0.99174
## CCCTCCTTCGTCTGCT-1 2 -18.2572 0.0727927 FALSE 0.66022
## ACTGAGTGTGTCGCTG-1 0 NA NA NA NA
## CGTAGCGTCTCTGCTG-1 0 NA NA NA NA
## ... ... ... ... ... ...
## AGAGCTTGTCCGACGT-1 0 NA NA NA NA
## GCTGCGACAATAGCAA-1 0 NA NA NA NA
## CGAACATAGTGAAGTT-1 2 -12.3017 0.632437 FALSE 0.99174
## TAGCCGGCATGTCGAT-1 0 NA NA NA NA
## CGAGAAGAGCAGATCG-1 0 NA NA NA NA
## Mode FALSE TRUE NA's
## logical 211696 2110 186194
## Limited
## Sig FALSE TRUE
## FALSE 209919 1777
## TRUE 1 2109
library(scran)
library(scuttle)
library(scater)
clusters <- quickCluster(sce2)
sce2 <- computeSumFactors(sce2, cluster=clusters)
sce2 <- logNormCounts(sce2)
#
set.seed(1000)
# modeling variables
dec.pbmc <- modelGeneVarByPoisson(sce2)
# calcualte top features
top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1)
#
set.seed(1000)
# Evaluate PCs
sce2 <- denoisePCA(sce2, subset.row=top.pbmc, technical=dec.pbmc)
# make UMAP plot
sce2 <- runUMAP(sce2, dimred="PCA")
#
g <- buildSNNGraph(sce2, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce2) <- factor(clust)
#
plotUMAP(sce2,colour_by="label")# evaluate ambinat RNA contamination in the empty droplets
amb <- metadata(e.out)$ambient[,1]
head(amb)## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903
## 1.669575e-07 1.669575e-07 1.407972e-04 5.201547e-05
## ENSMUSG00000104217 ENSMUSG00000033813
## 1.669575e-07 6.783772e-05
#
# remove ambient RNA
library(scater)
stripped <- sce2[names(amb),]
out <- removeAmbience(counts(stripped), ambient=amb,groups = colLabels(stripped))
#
# load correccted counts into scce object
counts(stripped, withDimnames=FALSE) <- out
stripped <- logNormCounts(stripped)
#
ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol=="Hba-a1"]
plotExpression(sce2, x="label", colour_by="label", features=ensmbl_id) +
ggtitle("Before")set.seed(1000)
dec <- modelGeneVar(stripped)
hvgs <- getTopHVGs(dec,n=1000)
stripped <- runPCA(stripped, ncomponents=10, subset_row=hvgs)
stripped <- runUMAP(stripped, dimred="PCA")
g <- buildSNNGraph(stripped, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(stripped) <- factor(clust)
plotUMAP(stripped,colour_by="label")dbl.dens <- computeDoubletDensity(stripped, #subset.row=top.mam,
d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0422 0.3840 0.7385 0.9630 1.3082 7.2457
#
plotColData(stripped, x="label", y="DoubletScore", colour_by="label")+
geom_hline(yintercept = quantile(colData(stripped)$DoubletScore,0.95),lty="dashed",color="red")#
cut_off <- quantile(stripped$DoubletScore,0.95)
stripped$isDoublet <- c("no","yes")[factor(as.integer(stripped$DoubletScore>=cut_off),levels=c(0,1))]
table(stripped$isDoublet)##
## no yes
## 2004 106
library(scater)
mtGene <- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol,pattern = "mt-")]
is.mito <- names(sce_clean) %in% mtGene
sce_clean <- addPerCellQC(sce_clean, subsets=list(Mito=is.mito))
#
plotColData(sce_clean,x="label", y="sum", colour_by="label")+ggtitle("read counts")plotColData(sce_clean,x="label", y="subsets_Mito_percent", colour_by="label")+ggtitle("mitocondrial content")#
plotColData(sce_clean,x="sum",y="subsets_Mito_percent",colour_by="label")+ggtitle("is.mito vs read counts")vars <- getVarianceExplained(sce_clean,
variables=c("DoubletScore","label","sum","detected","subsets_Mito_percent"))
plotExplanatoryVariables(vars)## Warning: Removed 2785 rows containing non-finite values (stat_density).